#root.dir <- here::here()
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  #root.dir = root.dir
  fig.height = 6,
  fig.width = 7.00787 #178mm
)  
knitr::opts_knit$set(#root.dir = root.dir, 
                     dpi = 350)  

library(data.table)
library(ggplot2)
library(cowplot)
library(ggplot2)
library(viridis)
library(dplyr)
library(genomation)
## Warning: replacing previous import 'Biostrings::pattern' by 'grid::pattern' when
## loading 'genomation'
library(biomaRt)
library(ggpubr)

Chicane Results & Preprocessing

The results from Chicane do contain NA values (we have queried with the authors why this occured and are awaiting a response), we first remove these and also add an adjusted p-value groupings for our interactions for later use:

res <- 
    fread("/rds/general/project/neurogenomics-lab/live/Projects/radicl_seq/data/chicane.results.rdcl.txt")
#50k NA's are removed
res <- res[!is.na(p.value)]
#make bins split res by q.value bins - 
#go with 0.05 separate as relatively high num and so you get q.val<0.05
#split rest by .2
res[,q_bin:=cut(q.value,breaks=c(0,0.05,0.2,0.4,0.6,0.8,1))]
#get gene names added on to results
baits <- 
    fread("/rds/general/project/neurogenomics-lab/live/Projects/radicl_seq/data/all_genes_location.txt",
            header = FALSE,stringsAsFactors = FALSE)
colnames(baits) <- c("chr","start","end","bait_name")    
res[,bait_name := baits$bait_name[match(res$bait.start,baits$start)]]

Distance of Interactions from Transcriptional Start Site

To validate the Chicane model for use with RADICL-Seq data, we hypothesised that there would be a build up of interacitons found close to the Transcriptional Start Site of the RNA in question. Since this is more likely to occur due to random chance, we hoped Chicane would take this into account and that the significant interactions would not be all in this region.

NOTE - Since this analysis is based on distance, trans interactions which account for 88.4% of all interactions, could not be included.

First, prepare the data for plotting:

#need to split out separate row for each count in count column to get density plot
#remove trans interactions....
res_stretch_cis <- res[!is.na(distance),.(count2=1:count),names(res)]

Now we can plot. Since we plot the distance on a Log 10 scale, we have added lines at 60Kb and 300Kb to make the plots easier to interpret.

ggplot(res_stretch_cis, aes(log(distance,base=10))) +
    geom_density(alpha = 0.1)+theme_cowplot()+
    scale_fill_viridis(name="Adj. P-value Bin",discrete=T)+
    scale_colour_viridis(name="Adj. P-value Bin",discrete=T)+
    geom_vline(xintercept=log(60000,base=10), colour="grey") +
    geom_vline(xintercept=log(300000,base=10), colour="grey") +
    annotate(x=log(60000,base=10),y=2,label="60Kb",vjust=2,geom="label")+
    annotate(x=log(300000,base=10),y=2,label="300Kb",vjust=2,geom="label")+
    xlab("Log 10 Distance from Interaction to RNA")

As can be seen, there is an initial peak of interactions close to the TSS. This proves that, as expected, we observe a pile up of interactions at the TSS.There are also two other, smaller peaks in the number of interactions at further distances.

We ideally want a model which can account for this pile up of interactions which is more likely due to random chance. We can check if Chicane has accounted for this by splitting the data based on the interactions’ adjusted p-value scores:

#split by groups
ggplot(res_stretch_cis, aes(log(distance,base=10), 
                                fill = q_bin, colour = q_bin)) +
    geom_density(alpha = 0.1)+theme_cowplot()+
    scale_fill_viridis(name="Adj. P-value Bin",discrete=T)+
    scale_colour_viridis(name="Adj. P-value Bin",discrete=T)+
    geom_vline(xintercept=log(60000,base=10), colour="grey") +
    geom_vline(xintercept=log(300000,base=10), colour="grey") +
    annotate(x=log(60000,base=10),y=2,label="60Kb",vjust=2,geom="label")+
    annotate(x=log(300000,base=10),y=2,label="300Kb",vjust=2,geom="label")+
    xlab("Log 10 Distance from Interaction to RNA")

NOTE - A significant proportion of all interactions are by MALAT1 and NEAT1 which can skew the data (discused in coding vs non-coding section). Thus all plots will also be shown excluding interactions from these RNA.

#Exlcuding 
#split by groups
ggplot(res_stretch_cis[!(bait_name %in% c("MALAT1","NEAT1"))], 
        aes(log(distance,base=10), fill = q_bin, colour = q_bin)) +
    geom_density(alpha = 0.1)+theme_cowplot()+
    scale_fill_viridis(name="Adj. P-value Bin",discrete=T)+
    scale_colour_viridis(name="Adj. P-value Bin",discrete=T)+
    geom_vline(xintercept=log(60000,base=10), colour="grey") +
    geom_vline(xintercept=log(300000,base=10), colour="grey") +
    annotate(x=log(60000,base=10),y=2,label="60Kb",vjust=2,geom="label")+
    annotate(x=log(300000,base=10),y=2,label="300Kb",vjust=2,geom="label")+
    xlab("Log 10 Distance from Interaction to RNA")

Excluding MALAT1 and NEAT1 has no effect.

The significant (Adj p-value<0.05) interactions have a peak in interactions further away from the TSS, showing Chicane does account for interactions which are more likely due to random chance with fewer of those that are close to the TSS being significant.

Finally, we hypothesised that the length of interacting RNA could play a role in the distance of the interaction from the TSS. To investigate this, we first have to standardise the base pair distance by the RNA length before replotting:

res_stretch_cis[,bait.length:=bait.end-bait.start]
res_stretch_cis[,stnd_dist:=distance/bait.length]

ggplot(res_stretch_cis, aes(log(stnd_dist,base=10))) +
    geom_density(alpha = 0.1)+theme_cowplot()+
    scale_fill_viridis(name="Adj. P-value Bin",discrete=T)+
    scale_colour_viridis(name="Adj. P-value Bin",discrete=T)+
    geom_vline(xintercept=log(1,base=10), colour="grey") +
    geom_vline(xintercept=log(120,base=10), colour="grey") +
    annotate(x=log(3,base=10),y=.8,label="RNA Length",vjust=2,geom="label")+
    annotate(x=log(400,base=10),y=.8,label="120x RNA Length",vjust=2,
             geom="label")+
    xlab("Log 10 Distance from Interaction to RNA of RNA Length")

Taking into account RNA length leaves just two peaks of the number interactions, one occuring where the distance is within the length of the RNA in question and another at a distance over 120x the length of the RNA away.

How does controlling for RNA length affect the significant interaction counts:

ggplot(res_stretch_cis, aes(log(stnd_dist,base=10), 
                            fill = q_bin, colour = q_bin)) +
    geom_density(alpha = 0.1)+theme_cowplot()+
    scale_fill_viridis(name="Adj. P-value Bin",discrete=T)+
    scale_colour_viridis(name="Adj. P-value Bin",discrete=T)+
    geom_vline(xintercept=log(1,base=10), colour="grey") +
    geom_vline(xintercept=log(120,base=10), colour="grey") +
    annotate(x=log(3,base=10),y=1.8,label="RNA Length",vjust=2,geom="label")+
    annotate(x=log(400,base=10),y=1.8,label="120x RNA Length",vjust=2,
                geom="label")+
    xlab("Log 10 Distance from Interaction to RNA of RNA Length")

And excluding MALAT1 and NEAT1:

ggplot(res_stretch_cis[!(bait_name %in% c("MALAT1","NEAT1"))], 
        aes(log(stnd_dist,base=10),fill = q_bin, colour = q_bin)) +
    geom_density(alpha = 0.1)+theme_cowplot()+
    scale_fill_viridis(name="Adj. P-value Bin",discrete=T)+
    scale_colour_viridis(name="Adj. P-value Bin",discrete=T)+
    geom_vline(xintercept=log(1,base=10), colour="grey") +
    geom_vline(xintercept=log(120,base=10), colour="grey") +
    annotate(x=log(3,base=10),y=1.8,label="RNA Length",vjust=2,geom="label")+
    annotate(x=log(400,base=10),y=1.8,label="120x RNA Length",vjust=2,
                geom="label")+
    xlab("Log 10 Distance from Interaction to RNA of RNA Length")

Excluding MALAT1 and NEAT1 has no effect.

The vast majority of cis significant interactions occur at a distance from the TSS within the length of the RNA. This is interesting as the more distant peak in the significant interactions is removed when you control for RNA length.

This made us question if there is a relationship between the adjusted p-value and the RNA length. We would hope that there wouldn’t be a strong relationship as, if there was, it may indicate that Chicane which is meant to be used for DNA-DNA interaction data (e.g. Capture Hi-C) may not be suitable for RNA-DNA interaction data (RADICL-Seq). DNA-DNA interaction dataset models may not need to control for the length of the DNA that binds since this wouldn’t fluctuate as much as RNA length in RNA-DNA interactions. To test this we performed a simple non-parametric correlation test on the cis interactions:

#is there a relationship between RNA length and q-value?
cor.test(res_stretch_cis$bait.length,res_stretch_cis$q.value,
            method = "spearman",exact = F)
#> 
#>  Spearman's rank correlation rho
#> 
#> data:  res_stretch_cis$bait.length and res_stretch_cis$q.value
#> S = 1.2863e+19, p-value < 2.2e-16
#> alternative hypothesis: true rho is not equal to 0
#> sample estimates:
#>       rho 
#> 0.1443764

And excluding MALAT1 and NEAT1

#is there a relationship between RNA length and q-value?
cor.test(res_stretch_cis[!(bait_name %in% c("MALAT1","NEAT1"))]$bait.length,
          res_stretch_cis[!(bait_name %in% c("MALAT1","NEAT1"))]$q.value,
          method = "spearman",exact = F)
#> 
#>  Spearman's rank correlation rho
#> 
#> data:  res_stretch_cis[!(bait_name %in% c("MALAT1", "NEAT1"))]$bait.length and res_stretch_cis[!(bait_name %in% c("MALAT1", "NEAT1"))]$q.value
#> S = 1.2692e+19, p-value < 2.2e-16
#> alternative hypothesis: true rho is not equal to 0
#> sample estimates:
#>       rho 
#> 0.1312633

Excluding MALAT1 and NEAT1 has no effect.

This correlation test actually showed the opposite, an increase in RNA length was related to an increase in adjusted p-value. This proved that the significant interactions produced by Chicane were not driven by the RNA length and thus increased our confidence int he use of the model for this data type.

ChromHMM Chromatin States where Interactions Occur

ChromHMM reference datasets can be used to infer the differing regulatory sections of the genome e.g. active Transcriptional Start Sites. We wanted to investiage in which chromatin states, the interactions found by RADICL-Seq were more frequently found. The RADICL-Seq data was from NiGa cells from differentiated human adipose-derived stem cells thus we used the Adipose Derived Mesenchymal Stem Cell Cultured Cells (hg38) E025 reference map from Roadmap. Matching the cell type ensured no differing, celltype specific epigenetic features.

The reference map can be loaded as follows:

#E025 Adipose Derived Mesenchymal Stem Cell Cultured Cells - hg38
#from: https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/
#This bed file only contains one metadata column so genomation::readBed will fail
#Approach below uses code from in this function
chrHMM_url <- 
    "https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/E025_15_coreMarks_hg38lift_segments.bed.gz"
file <- genomation:::compressedAndUrl2temp(chrHMM_url)
#read in as dataframe
df<-readr::read_delim(file,skip=0,col_names=FALSE, delim="\t")
names(df) <- c("chr","start","end","ChromHMM") 
#make GRanges obj
g = GenomicRanges::makeGRangesFromDataFrame(
    df, 
    keep.extra.columns=FALSE, 
    starts.in.df.are.0based=FALSE,
    ignore.strand=is.null(NULL))
col.names1=list(chr=1,start=2,end=3,strand=NULL)
black.names=c("seqnames", "ranges", "strand", "seqlevels", "seqlengths",
              "isCircular", "start", "end", "width", "element")
my.mcols = df[,-unlist(col.names1),drop=FALSE]
#add meta data column
GenomicRanges::mcols(g) = my.mcols[, !colnames(my.mcols) %in% black.names]
#split GRange object by the different names
chrHMM_list <- GenomicRanges::split(g, g$ChromHMM, drop = TRUE)
#get state names for IDs
sort(unique(df$ChromHMM))

The 15 states associated with the reference map are discussed here but in short are:

  • 1_TssA active transcription start site (TSS) -proximal promoter states
  • 2_TssAFlnk Flanking active transcription start site (TSS). -proximal promoter states
  • 3_TxFlnk a transcribed state at the 5′ and 3′ end of genes -showing both promoter and enhancer signatures
  • 4_Tx Strong transcription
  • 5_TxWk Weak transcription
  • 6_EnhG Genic enhancer -genic enhancers occur more frequently in gene bodies & in exons, while enhancers don’t
  • 7_Enh enhancers
  • 8_ZNF/Rpts zinc finger protein genes and repeats. -ZNFs are involved in the regulation of several cellular processes (v abundant)
  • 9_Het heterochromatin. - tightly packed DNA, inactive states
  • 10_TssBiv bivalent/poised transcription start site. -both activation and repression (low expression), poised to express
  • 11_BivFlnk flanking bivalent transcription start site/enhancer
  • 12_EnhBiv bivalent enhancer
  • 13_ReprPC repressed Polycomb. -gene silencing
  • 14_ReprPCWk weak repressed Polycomb
  • 15_Quies quiescent/low - inactivity/dormitary

Roughly, states 1-7 are active transcriptional sites while 8-15 are inactive or poised.

We need to preprocess and convert our Chicane results into a GRange object to compare overlapping regions with our reference map. We want to split the results based on the adjusted p-value groups were created earlier:

setnames(res,c("target.chr","target.start","target.end"),
            c("chr","start","end"))
            #make GRanges
gres = GenomicRanges::makeGRangesFromDataFrame(
    res, 
    keep.extra.columns=TRUE, 
    starts.in.df.are.0based=FALSE,
    ignore.strand=TRUE)
#now split based on q-value bins
gres_list <- GenomicRanges::split(gres, gres$q_bin, drop = TRUE)
rm(gres)

Now we can make an annotation object using genomation which inspects the regions of the genome where interactions were found:

annotation <- 
    genomation::annotateWithFeatures(gres_list, chrHMM_list)

Next we can get the overlap percentages of the interactions to each of the different states.

Note that this won’t add up to 100% for each adjusted p-value group since it measures the proportion of the DNA segment from the interaction that overlaps a specific state. A single interaction could be split across states, for example with 20% of its base pairs in a active TSS (state 1) and 70% of its base pairs in a flanking active TSS (state 2) and the remaining 10% in a unmarked region of DNA.

#don't produce heatmap, take matrix, standardise and replot
annot_mtrx <- genomation::heatTargetAnnotation(annotation,plot=FALSE)
#change column order
col.order <- paste0("E",1:15)
annot_mtrx <- annot_mtrx[,col.order]

Now we can plot and see where our interactions lie:

#plot raw first
#convert to df
annot_df<-reshape2::melt(annot_mtrx)
names(annot_df) <- c("q-value","State","Overlap")
#Add better explaining state names
annot_df <- annot_df%>% 
    mutate(State_name = as.factor(case_when(
        .$State == "E1" ~ "1.Active.TSS",
        .$State == "E2" ~ "2.Flanking.Active.TSS",
        .$State == "E3" ~ "3.Flanking.Transcribed.State",
        .$State == "E4" ~ "4.Strong.Transcription",
        .$State == "E5" ~ "5.Weak.Transcription",
        .$State == "E6" ~ "6.Genic.Enhancer",
        .$State == "E7" ~ "7.Enhancer",
        .$State == "E8" ~ "8.ZNC.Finger.Prot.&.Repeats",
        .$State == "E9" ~ "9.Heterochromatin",
        .$State == "E10" ~ "10.Bivalent/Poised.TSS",
        .$State == "E11" ~ "11.Flanking.Bivalent.TSS",
        .$State == "E12" ~ "12.Bivalent.Enhancer",
        .$State == "E13" ~ "13.Repressed.Polycomb",
        .$State == "E14" ~ "14.Weak.Repressed.Polycomb",
        .$State == "E15" ~ "15.Quiescent")))

# Reorder factor levels for plot
annot_df$State_name <- 
    factor(annot_df$State_name,
            levels = levels(annot_df$State_name)[c(1,8:15,2:7)])
ggplot(annot_df,aes(x=State_name,y=`q-value`,fill=Overlap))+
    geom_tile()+theme_cowplot()+scale_fill_viridis_c()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    xlab("ChromHMM State")+
    ylab("Adj. P-value Bin")

It is interesting that the majority of the regions where the interactions were found are related to active transcription. This holds across all ajusted p-value groups. This highlights that genom-wide RNA interactions may regulate transcription (NB Need to discuss this further….).

To investigate if there was any difference in the states where the significant interactions (adj. p-value<0.05) were more likely to be found, we can standardise the overlap percentage for each state and replot:

#try standardising across states
scl_annot_mtrx <- apply(annot_mtrx,2,function(x) scale(x)[,1])
#plot
scl_annot_df<-reshape2::melt(scl_annot_mtrx)
names(scl_annot_df) <- c("q-value","State","Overlap")
#Add better explaining state names
scl_annot_df <- scl_annot_df%>% 
    mutate(State_name = as.factor(case_when(
        .$State == "E1" ~ "1.Active.TSS",
        .$State == "E2" ~ "2.Flanking.Active.TSS",
        .$State == "E3" ~ "3.Flanking.Transcribed.State",
        .$State == "E4" ~ "4.Strong.Transcription",
        .$State == "E5" ~ "5.Weak.Transcription",
        .$State == "E6" ~ "6.Genic.Enhancer",
        .$State == "E7" ~ "7.Enhancer",
        .$State == "E8" ~ "8.ZNC.Finger.Prot.&.Repeats",
        .$State == "E9" ~ "9.Heterochromatin",
        .$State == "E10" ~ "10.Bivalent/Poised.TSS",
        .$State == "E11" ~ "11.Flanking.Bivalent.TSS",
        .$State == "E12" ~ "12.Bivalent.Enhancer",
        .$State == "E13" ~ "13.Repressed.Polycomb",
        .$State == "E14" ~ "14.Weak.Repressed.Polycomb",
        .$State == "E15" ~ "15.Quiescent")))
# Reorder factor levels for plot
scl_annot_df$State_name <- 
    factor(scl_annot_df$State_name,
           levels = levels(scl_annot_df$State_name)[c(1,8:15,2:7)])
ggplot(scl_annot_df,aes(x=State_name,y=`q-value`,fill=Overlap))+
    geom_tile()+theme_cowplot()+scale_fill_viridis_c()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    xlab("ChromHMM State")+
    ylab("Adj. P-value Bin")

Interestingly, this shows that significant interactions are found morefrequently in the inactive or poised regions of the genome. This may suggest that significant interactions may play more of a role in preventing transcription (NB Need to discuss this further….).

We can rerun this whole analysis excluding MALAT1 and NEAT1 to see if it has any effect:

#E025 Adipose Derived Mesenchymal Stem Cell Cultured Cells - hg38
#from: https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/
#This bed file only contains one metadata column so genomation::readBed will fail
#Approach below uses code from in this function
chrHMM_url <- 
  "https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/E025_15_coreMarks_hg38lift_segments.bed.gz"
file <- genomation:::compressedAndUrl2temp(chrHMM_url)
#read in as dataframe
df<-readr::read_delim(file,skip=0,col_names=FALSE, delim="\t")
names(df) <- c("chr","start","end","ChromHMM") 
#make GRanges obj
g = GenomicRanges::makeGRangesFromDataFrame(
  df, 
  keep.extra.columns=FALSE, 
  starts.in.df.are.0based=FALSE,
  ignore.strand=is.null(NULL))
col.names1=list(chr=1,start=2,end=3,strand=NULL)
black.names=c("seqnames", "ranges", "strand", "seqlevels", "seqlengths",
              "isCircular", "start", "end", "width", "element")
my.mcols = df[,-unlist(col.names1),drop=FALSE]
#add meta data column
GenomicRanges::mcols(g) = my.mcols[, !colnames(my.mcols) %in% black.names]
#split GRange object by the different names
chrHMM_list <- GenomicRanges::split(g, g$ChromHMM, drop = TRUE)
#get state names for IDs
sort(unique(df$ChromHMM))

setnames(res,c("target.chr","target.start","target.end"),
         c("chr","start","end"))
#make GRanges
gres = GenomicRanges::makeGRangesFromDataFrame(
  res[!(bait_name %in% c("MALAT1","NEAT1"))], 
  keep.extra.columns=TRUE, 
  starts.in.df.are.0based=FALSE,
  ignore.strand=TRUE)
#now split based on q-value bins
gres_list <- GenomicRanges::split(gres, gres$q_bin, drop = TRUE)
rm(gres)
#get annotations
annotation <- 
  genomation::annotateWithFeatures(gres_list, chrHMM_list)

Now create the two plots:

#don't produce heatmap, take matrix, standardise and replot
annot_mtrx <- genomation::heatTargetAnnotation(annotation,plot=FALSE)
#change column order
col.order <- paste0("E",1:15)
annot_mtrx <- annot_mtrx[,col.order]

#plot raw first
#convert to df
annot_df<-reshape2::melt(annot_mtrx)
names(annot_df) <- c("q-value","State","Overlap")
#Add better explaining state names
annot_df <- annot_df%>% 
    mutate(State_name = as.factor(case_when(
        .$State == "E1" ~ "1.Active.TSS",
        .$State == "E2" ~ "2.Flanking.Active.TSS",
        .$State == "E3" ~ "3.Flanking.Transcribed.State",
        .$State == "E4" ~ "4.Strong.Transcription",
        .$State == "E5" ~ "5.Weak.Transcription",
        .$State == "E6" ~ "6.Genic.Enhancer",
        .$State == "E7" ~ "7.Enhancer",
        .$State == "E8" ~ "8.ZNC.Finger.Prot.&.Repeats",
        .$State == "E9" ~ "9.Heterochromatin",
        .$State == "E10" ~ "10.Bivalent/Poised.TSS",
        .$State == "E11" ~ "11.Flanking.Bivalent.TSS",
        .$State == "E12" ~ "12.Bivalent.Enhancer",
        .$State == "E13" ~ "13.Repressed.Polycomb",
        .$State == "E14" ~ "14.Weak.Repressed.Polycomb",
        .$State == "E15" ~ "15.Quiescent")))

# Reorder factor levels for plot
annot_df$State_name <- 
    factor(annot_df$State_name,
            levels = levels(annot_df$State_name)[c(1,8:15,2:7)])
ggplot(annot_df,aes(x=State_name,y=`q-value`,fill=Overlap))+
    geom_tile()+theme_cowplot()+scale_fill_viridis_c()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    xlab("ChromHMM State")+
    ylab("Adj. P-value Bin")

#try standardising across states
scl_annot_mtrx <- apply(annot_mtrx,2,function(x) scale(x)[,1])
#plot
scl_annot_df<-reshape2::melt(scl_annot_mtrx)
names(scl_annot_df) <- c("q-value","State","Overlap")
#Add better explaining state names
scl_annot_df <- scl_annot_df%>% 
    mutate(State_name = as.factor(case_when(
        .$State == "E1" ~ "1.Active.TSS",
        .$State == "E2" ~ "2.Flanking.Active.TSS",
        .$State == "E3" ~ "3.Flanking.Transcribed.State",
        .$State == "E4" ~ "4.Strong.Transcription",
        .$State == "E5" ~ "5.Weak.Transcription",
        .$State == "E6" ~ "6.Genic.Enhancer",
        .$State == "E7" ~ "7.Enhancer",
        .$State == "E8" ~ "8.ZNC.Finger.Prot.&.Repeats",
        .$State == "E9" ~ "9.Heterochromatin",
        .$State == "E10" ~ "10.Bivalent/Poised.TSS",
        .$State == "E11" ~ "11.Flanking.Bivalent.TSS",
        .$State == "E12" ~ "12.Bivalent.Enhancer",
        .$State == "E13" ~ "13.Repressed.Polycomb",
        .$State == "E14" ~ "14.Weak.Repressed.Polycomb",
        .$State == "E15" ~ "15.Quiescent")))
# Reorder factor levels for plot
scl_annot_df$State_name <- 
    factor(scl_annot_df$State_name,
           levels = levels(scl_annot_df$State_name)[c(1,8:15,2:7)])
ggplot(scl_annot_df,aes(x=State_name,y=`q-value`,fill=Overlap))+
    geom_tile()+theme_cowplot()+scale_fill_viridis_c()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    xlab("ChromHMM State")+
    ylab("Adj. P-value Bin")

Similar results to including MALAT1/NEAT1, perhaps not as strong enrichment of repressed/inactive states.

Coding Vs Non-coding RNAs

We next wanted to investigate whether different classifications of RNA were found to have significant interactions at different locations in the genome.

Here, we are interested in if the interaction was cis/trans, the distance of the interaction and what was the specific type of RNA. We need to thus do some preprocessing to our results:

#add cis, trans indicator
res[is.na(distance),interaction:="trans"]
res[!is.na(distance),interaction:="cis"]

#add in gene type i.e. coding, non-coding
genes <- unique(res$bait_name)
mart <- useMart("ENSEMBL_MART_ENSEMBL", host = "useast.ensembl.org")
mart <- useDataset("hsapiens_gene_ensembl", mart)
annotLookup <- getBM(
    mart = mart,
    attributes = c(
        "hgnc_symbol",
        "entrezgene_id",
        "ensembl_gene_id",
        "gene_biotype"),
    filter = "hgnc_symbol",
    values = genes,
    uniqueRows=TRUE,
    useCache = FALSE)

annotLookup <- as.data.table(annotLookup)

#gene_biotype
setnames(annotLookup,
            c("bait_name","entrezgene_id","ensembl_gene_id","gene_biotype"))
setkey(res,bait_name)
setkey(annotLookup,bait_name)
res[annotLookup,gene_biotype:= i.gene_biotype]
#clear memory
rm(genes,annotLookup,mart)
#10,004,838 with 1,716,157 without
res_biotyp <- res[!is.na(gene_biotype),]

We have 10 million interactions that have a related RNA type in BiomaRt which can be investigated. These biotype annotations are from ENSEMBL. The annotations are quite low-level so we are going to roll up as follows:

  • Pseudogene: A gene that has homology to known protein-coding genes but contain a frameshift and/or stop codon(s) which disrupts the ORF. Thought to have arisen through duplication followed by loss of function

  • Protein coding: Gene/transcipt that contains an open reading frame (ORF).

  • TR gene: T cell receptor gene that undergoes somatic recombination, annotated in collaboration with IMGT

  • TEC(To be Experimentally Confirmed): Regions with EST clusters that have polyA features that could indicate the presence of protein coding genes. These require experimental validation, either by 5’ RACE or RT-PCR to extend the transcripts, or by confirming expression of the putatively-encoded peptide with specific antibodies.

  • ncRNA: A non-coding gene + Long non-coding RNA (lncRNA) A non-coding gene/transcript >200bp in length

  • Mt_RNA: mitochondrial RNA

  • Ribozyme: ribozymal RNA

  • scRNA: A small conditional RNA (scRNA) is a small RNA molecule or complex (typically less than approximately 100 nt) engineered to interact and change conformation conditionally in response to cognate molecular inputs so as to perform signal transduction in vitro, in situ, or in vivo.(DEF FROM WIKIPEDIA)

  • IG gene: Immunoglobulin gene that undergoes somatic recombination, annotated in collaboration with IMGT http://www.imgt.org/.

res_biotyp[grepl( "protein_coding", gene_biotype),gene_biotype_hl:="pcRNA"]
res_biotyp[grepl( "pseudogene", gene_biotype),gene_biotype_hl:="pseudogene"]
res_biotyp[grepl( "TR_", gene_biotype),gene_biotype_hl:="TR_gene"]
res_biotyp[grepl( "TEC", gene_biotype),gene_biotype_hl:=gene_biotype]
res_biotyp[gene_biotype %in% c("miRNA","miscRNA","piRNA","rRNA","siRNA","snRNA",
                                "snoRNA","tRNA","vaultRNA","lncRNA","misc_RNA",
                                "scaRNA"),
           gene_biotype_hl:="ncRNA"]
res_biotyp[grepl( "Mt_", gene_biotype),gene_biotype_hl:="Mt_RNA"]
res_biotyp[gene_biotype=="ribozyme",gene_biotype_hl:=gene_biotype]
res_biotyp[gene_biotype=="scRNA",gene_biotype_hl:=gene_biotype]
res_biotyp[grepl( "IG_", gene_biotype),gene_biotype_hl:="IG_gene"]
#update naming to pcRNA
res_biotyp[gene_biotype=="protein_coding",gene_biotype:="pcRNA"]

Before we look at any in depth questions, what is the breakdown of our significant interactions by RNA type?

ggplot(res_biotyp[q.value<0.05,],
        aes(x=gene_biotype_hl,fill=gene_biotype_hl)) +
    geom_bar()+
    theme_cowplot()+
    scale_fill_viridis(discrete=T)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    xlab("")+
    ylab("Number of significant interactions")

And if we exclude MALAT1 and NEAT1

ggplot(res_biotyp[q.value<0.05 & !(bait_name %in% 
                                                  c("MALAT1","NEAT1")),],
        aes(x=gene_biotype_hl,fill=gene_biotype_hl)) +
    geom_bar()+
    theme_cowplot()+
    scale_fill_viridis(discrete=T)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    xlab("Excluding MALAT1 & NEAT1")+
    ylab("Number of significant interactions")

So excluding MALAT1 and NEAT1 most significant interactions are by protein-coding RNA.

The first thing we wanted to know was whether there was a difference in the type of RNA that are involved in cis or trans interactions:

#plot sig with higher and lower level RNA types
#add proportional counts
res_counts <- res_biotyp[q.value<0.05,.N,by=.(interaction,gene_biotype_hl)]
res_counts[,all_N:=sum(N),by=interaction]
res_counts[,prop:=(N/all_N)]
ggplot(res_counts,aes(x=gene_biotype_hl,y=prop,fill=gene_biotype_hl)) +
    geom_bar(stat="identity")+
    facet_wrap(~interaction)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    xlab("")+
    ylab("Proportion of significant cis/trans interactions")

It is clear that there is preferential for non-coding RNA to have trans-interactions whereas the opposite is true for protein-coding RNA. We can see which non-coding RNA are typically involved too:

res_counts <- res_biotyp[q.value<0.05,.N,by=.(interaction,gene_biotype)]
res_counts[,all_N:=sum(N),by=interaction]
res_counts[,prop:=(N/all_N)]
ggplot(res_counts, aes(x=gene_biotype,y=prop,fill=gene_biotype)) +
    geom_bar(stat="identity")+
    facet_wrap(~interaction)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position="none")+
    xlab("")+
    ylab("Proportion of significant cis/trans interactions")

The vast majority are long non-coding RNAs (NB Need to discuss this further….).

Given that most trans interactions are non-coding RNA’s, we wondered whether there was a relationship between the distance of the interaction from where the RNA was transcribed and the RNA type. Since we can’t derive the distance for trans interactions, we will only consider cis:

#finally for cis what is the difference in distance of interaction across
#diff RNA types
#Remove IG TR, only 2,3 interactions respectively
#specify comparisons
ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      (!gene_biotype_hl %in% c("IG_gene","TR_gene"))], 
       aes(x=gene_biotype_hl,y=log(distance,base=10),fill=gene_biotype_hl)) +
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(9.5,3),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group=".all.")#Pairwise comparison against all(mean)

No clear relationship but what if, as before, we look at specific types of non-coding RNA

#snoRNA,misc_RNA removed too low cases - 2,3
#removed pseudogene RNAs too to compare coding and non-coding directly
ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      (!gene_biotype_hl %in% c("IG_gene","TR_gene",
                                                "pseudogene")) &
                      (!gene_biotype %in% 
                           c("snoRNA","misc_RNA"))], 
       aes(x=gene_biotype,y=log(distance,base=10),fill=gene_biotype)) +
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(9.5,4),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group=".all.")#Pairwise comparison against all(mean)

miRNA and snRNA seem to be significantly different to the mean interaction distance, occuring at a lot closer distances.

What if we separate out MALAT1 and NEAT1 since they are involved in so many interactions:

#create separate column for MALAT1, NEAT1, gene biotype
res_biotyp[,gene_biotype_excl:=gene_biotype]
#combine all pseudogenes - low numbers
res_biotyp[gene_biotype_hl=="pseudogene",gene_biotype_excl:=gene_biotype_hl]
res_biotyp[gene_biotype_hl=="IG_gene",gene_biotype_excl:=gene_biotype_hl]
res_biotyp[bait_name=="MALAT1",gene_biotype_excl:=bait_name]
res_biotyp[bait_name=="NEAT1",gene_biotype_excl:=bait_name]
#snoRNA,misc_RNA removed too low cases - 2,3
#removed pseudogene RNAs too to compare coding and non-coding directly
ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      (!gene_biotype_hl %in% c("IG_gene","TR_gene",
                                                "pseudogene")) &
                      (!gene_biotype %in% 
                           c("snoRNA","misc_RNA"))], 
       aes(x=gene_biotype_excl,y=log(distance,base=10),fill=gene_biotype_excl))+
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(9.5,3),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group=".all.")#Pairwise comparison against all(mean)

MALAT1 and NEAT1 seem to only interact at long distances from where they are transcribed.

Now we have this lower level of RNA states we can also view the number of significant interactions for each:

ggplot(res_biotyp[q.value<0.05,],
        aes(x=gene_biotype_excl,fill=gene_biotype_excl)) +
    geom_bar()+
    theme_cowplot()+
    scale_fill_viridis(discrete=T)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    xlab("")+
    ylab("Number of significant interactions")

And without MALAT1 and NEAT1:

ggplot(res_biotyp[q.value<0.05 & !(bait_name %in% 
                                                  c("MALAT1","NEAT1")),],
        aes(x=gene_biotype_excl,fill=gene_biotype_excl)) +
    geom_bar()+
    theme_cowplot()+
    scale_fill_viridis(discrete=T)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    xlab("")+
    ylab("Number of significant interactions")

What if we look at these distances normalised to the RNA length:

res_biotyp[,bait.length:=bait.end-bait.start]
res_biotyp[,stnd_dist:=distance/bait.length]

ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      (!gene_biotype_hl %in% c("IG_gene","TR_gene"))], 
       aes(x=gene_biotype_hl,y=log(stnd_dist,base=10),fill=gene_biotype_hl)) +
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(9.5,3),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group=".all.")#Pairwise comparison against all(mean)

Clear difference across the types, let’s look at specific types of non-coding RNA:

#snoRNA,misc_RNA removed too low cases - 2,3
#removed pseudogene RNAs too to compare coding and non-coding directly
ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      (!gene_biotype_hl %in% c("IG_gene","TR_gene",
                                                "pseudogene")) &
                      (!gene_biotype %in% 
                           c("snoRNA","misc_RNA"))], 
       aes(x=gene_biotype,y=log(stnd_dist,base=10),fill=gene_biotype)) +
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(7,4),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group=".all.")#Pairwise comparison against all(mean)

Relative to their length, miRNA and snRNA seem to interact at a lot further distances.

What if we separate out MALAT1 and NEAT1 since they are involved in so many interactions:

#snoRNA,misc_RNA removed too low cases - 2,3
#removed pseudogene RNAs too to compare coding and non-coding directly
ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      (!gene_biotype_hl %in% c("IG_gene","TR_gene",
                                                "pseudogene")) &
                      (!gene_biotype %in% 
                           c("snoRNA","misc_RNA"))], 
       aes(x=gene_biotype_excl,y=log(stnd_dist,base=10),fill=gene_biotype_excl))+
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(7,3),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group=".all.")#Pairwise comparison against all(mean)

MALAT1 and NEAT1 interact at similar distances to miRNA and snRNA. The lncRNA and protein coding RNA appear to interact at similar distances but if we look at there in isolation and perform a non-parametric t-test between them, does this hold?

#leave just lncRNA and pcRNA - exclude MALAT1 and NEAT1
ggplot(res_biotyp[q.value<0.05 & interaction=="cis" & 
                      gene_biotype_excl %in% c("lncRNA","pcRNA")], 
       aes(x=gene_biotype_excl,y=log(stnd_dist,base=10),fill=gene_biotype_excl))+
    geom_violin(trim=FALSE)+
    geom_boxplot(width=0.1)+
    theme_cowplot()+
    scale_fill_viridis(discrete=T,alpha=0.5)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position="none")+
    ylab("Log 10 Distance from Interaction to RNA")+
    xlab("Interaction Type")+
    #Wilcoxon test  - t-test (non-parametric)
    stat_compare_means(label = "p.signif", method = "wilcox.test",
                       label.y=rep(5,2),hide.ns = TRUE,
                       aes(label = ..p.adj..), #adjust for multiple testing
                       ref.group="pcRNA")#Pairwise comparison against all(mean)

We can look into these differences further but first, let’s consider the differences between significant cis and trans interactions. There are 34627 significant interactions. Of these, 84.2% are trans whereas 15.8% are cis. Is there a small number of RNAs contributing to all of these cis or trans interactions?

#get the number of interactions per RNA for trans/cis
res_meta <- res[q.value<0.05,list(sum(count),length(unique(bait_name))),
      by=interaction]
res_meta[,prop:=V1/V2]

#now get
ggplot(res_meta, 
       aes(x=interaction,y=prop,fill=interaction)) +
  geom_bar(stat="identity")+
  theme_cowplot()+
  scale_fill_viridis(discrete=T)+
  theme(legend.position="none")+
  ylab("Number of Interactions per RNA")+
  xlab("Interaction Type")

Roughly the same number of interactions per RNA for trans and cis interactions.

What if interactions associated with Malat1 and Neat1 are dropped, how does that affect: vast majority (80%+) of trans interactions are by non-coding RNA; predominately lncRNAs and the majority of cis are by protein coding RNAs (75%+)?

#remove Malat1 and Neat1 and then replot
#plot sig with higher and lower level RNA types
#add proportional counts
res_counts_excl <- res_biotyp[q.value<0.05 & !(bait_name %in% 
                                                  c("MALAT1","NEAT1")),
                                .N,by=.(interaction,gene_biotype_hl)]
res_counts_excl[,all_N:=sum(N),by=interaction]
res_counts_excl[,prop:=(N/all_N)]
ggplot(res_counts_excl,aes(x=gene_biotype_hl,y=prop,fill=gene_biotype_hl)) +
  geom_bar(stat="identity")+
  facet_wrap(~interaction)+
  theme_cowplot()+
  scale_fill_viridis(discrete=T)+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position="none")+
  xlab("RNA types - Excl MALAT1, NEAT1")+
  ylab("Proportion of significant cis/trans interactions")

So when MALAT1 and NEAT1 are removed, both cis and trans interactions appear to have the same proportions of RNA types with preferential binding. Does this hold for the RNA subtypes?

res_counts_excl <- res_biotyp[q.value<0.05 & !(bait_name %in% 
                                                  c("MALAT1","NEAT1")),
                                .N,by=.(interaction,gene_biotype)]
res_counts_excl[,all_N:=sum(N),by=interaction]
res_counts_excl[,prop:=(N/all_N)]
ggplot(res_counts_excl, aes(x=gene_biotype,y=prop,fill=gene_biotype)) +
  geom_bar(stat="identity")+
  facet_wrap(~interaction)+
  theme_cowplot()+
  scale_fill_viridis(discrete=T)+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position="none")+
  xlab("RNA types - Excl MALAT1, NEAT1")+
  ylab("Proportion of significant cis/trans interactions")

Again the proportions appear identical when MALAT1 and NEAT1 are excluded.

ChromHMM & Coding, Non-coding

Let’s look at the significant coding non-coding groups of RNA fromt he last violin plot across the different ChromHMM states to see if there is any difference in where they interact.

First, derive annotation data:

#E025 Adipose Derived Mesenchymal Stem Cell Cultured Cells - hg38
#from: https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/
#This bed file only contains one metadata column so genomation::readBed will fail
#Approach below uses code from in this function
chrHMM_url <- 
  "https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/E025_15_coreMarks_hg38lift_segments.bed.gz"
file <- genomation:::compressedAndUrl2temp(chrHMM_url)
#read in as dataframe
df<-readr::read_delim(file,skip=0,col_names=FALSE, delim="\t")
names(df) <- c("chr","start","end","ChromHMM") 
#make GRanges obj
g = GenomicRanges::makeGRangesFromDataFrame(
  df, 
  keep.extra.columns=FALSE, 
  starts.in.df.are.0based=FALSE,
  ignore.strand=is.null(NULL))
col.names1=list(chr=1,start=2,end=3,strand=NULL)
black.names=c("seqnames", "ranges", "strand", "seqlevels", "seqlengths",
              "isCircular", "start", "end", "width", "element")
my.mcols = df[,-unlist(col.names1),drop=FALSE]
#add meta data column
GenomicRanges::mcols(g) = my.mcols[, !colnames(my.mcols) %in% black.names]
#split GRange object by the different names
chrHMM_list <- GenomicRanges::split(g, g$ChromHMM, drop = TRUE)
#get state names for IDs
sort(unique(df$ChromHMM))

#set up results with coding non-coding info
#RNA types with less than 10 interactions removed
res_biotyp[q.value<0.05,.N,by=gene_biotype_excl][N<=10]$gene_biotype_excl
#[1] "IG_gene" "misc_RNA" "snoRNA" "TR_J_gene"  "TR_V_gene"
#removed pseudogene RNAs too to compare coding and non-coding directly
sig_res_rna_typ <-
  res_biotyp[q.value<0.05 & 
               !(gene_biotype_excl %in% c("IG_gene","misc_RNA","snoRNA",
                                          "TR_J_gene","TR_V_gene"))]
setnames(sig_res_rna_typ,c("target.chr","target.start","target.end"),
         c("chr","start","end"))
#make GRanges
gres = GenomicRanges::makeGRangesFromDataFrame(
  sig_res_rna_typ, 
  keep.extra.columns=TRUE, 
  starts.in.df.are.0based=FALSE,
  ignore.strand=TRUE)
#now split based on q-value bins
gres_list <- GenomicRanges::split(gres, gres$gene_biotype_excl, drop = TRUE)
rm(gres)
#get annotations
annotation <- 
  genomation::annotateWithFeatures(gres_list, chrHMM_list)

Now create the plot:

#don't produce heatmap, take matrix, standardise and replot
annot_mtrx <- genomation::heatTargetAnnotation(annotation,plot=FALSE)
#change column order
col.order <- paste0("E",1:15)
annot_mtrx <- annot_mtrx[,col.order]

#plot raw first
#convert to df
annot_df<-reshape2::melt(annot_mtrx)
names(annot_df) <- c("RNA Type","State","Overlap")
#Add better explaining state names
annot_df <- annot_df%>% 
    mutate(State_name = as.factor(case_when(
        .$State == "E1" ~ "1.Active.TSS",
        .$State == "E2" ~ "2.Flanking.Active.TSS",
        .$State == "E3" ~ "3.Flanking.Transcribed.State",
        .$State == "E4" ~ "4.Strong.Transcription",
        .$State == "E5" ~ "5.Weak.Transcription",
        .$State == "E6" ~ "6.Genic.Enhancer",
        .$State == "E7" ~ "7.Enhancer",
        .$State == "E8" ~ "8.ZNC.Finger.Prot.&.Repeats",
        .$State == "E9" ~ "9.Heterochromatin",
        .$State == "E10" ~ "10.Bivalent/Poised.TSS",
        .$State == "E11" ~ "11.Flanking.Bivalent.TSS",
        .$State == "E12" ~ "12.Bivalent.Enhancer",
        .$State == "E13" ~ "13.Repressed.Polycomb",
        .$State == "E14" ~ "14.Weak.Repressed.Polycomb",
        .$State == "E15" ~ "15.Quiescent")))

# Reorder factor levels for plot
annot_df$State_name <- 
    factor(annot_df$State_name,
            levels = levels(annot_df$State_name)[c(1,8:15,2:7)])
ggplot(annot_df,aes(x=State_name,y=`RNA Type`,fill=Overlap))+
    geom_tile()+theme_cowplot()+scale_fill_viridis_c()+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))+
    xlab("ChromHMM State")+
    ylab("RNA Type")

All more generally bind in the active states (1-7) with the most prevalent overlap in the state 1: Active TSS. More thoughts:

  • Interesting NEAT1 and MALAT1 don’t bind to flanking transcribed state at all while all others do.
  • Pseudogenes seem to bind less frequently in states 2-7 than ncRNA and pcRNA.
  • The pcRNA and ncRNA seem to have significant overlap with the quiescent state that isn’t present in the other inactive stages (8-14) or in the pseudogenes.
  • It is also notable that miRNA bind preferentially to enhancers, noted in (literature)[https://pubmed.ncbi.nlm.nih.gov/28639195/]

Now we want to understand what interactions between the RNA types and specific states are significant. To do this, we performed a chi-squared test. We wanted to avoid, in the first place, running individual tests for each state-RNA interaction overlap score. So we started with an overall difference in distribution test (chi-square) and then did multiple-testing corrected post-hoc comparisons.

Reminder - Chi-Square test tests for a relationship between two categorical variables, in our case RNA Type and ChromHMM state. Null hypothesis is there is no relationship.

#convert back to nxm matrix
annot_mtrx <- 
  reshape2::dcast(annot_df, `RNA Type` ~ State_name,value.var = "Overlap")
# RNA Type given own column in matrix, change to row names
rownames(annot_mtrx) <- annot_mtrx[,"RNA Type"]
annot_mtrx <- annot_mtrx[,-1]
chisq <- chisq.test(annot_mtrx, correct = TRUE)
chisq
#> 
#>  Pearson's Chi-squared test
#> 
#> data:  annot_mtrx
#> X-squared = 271.14, df = 84, p-value < 2.2e-16

So there is a relationship between RNA type and ChromHMM state. Let’s look at the significance of specific interactions: